rm(list=ls(all=TRUE))
pacman::p_load(vcd, magrittr, readr, caTools, ggplot2, dplyr, plotly,tidyverse,ggrepel)
load("data/tf0.rdata")
load("data/tf3.rdata")
load("data/tf4.rdata")
load("data/tf5.rdata")
Fig-1:行銷流程

探索TF資料

#了解資料數量(顧客數、商品數...)
sapply(list(cust=A0,tid=X0,items=Z0), nrow)
##   cust    tid  items 
##  32241 119328 817182
#4個月內來店次數
A0 <- A0 %>% group_by(cust) %>% mutate(Freq = sum(f)) 

#讓分佈圖看起來更清楚
A1 <- A0 %>% filter(Freq < 31) 
總購買次數分佈圓餅圖
A1$Freq <- as.factor(A1$Freq)
pie <- ggplot(A1,
  aes(x = factor(1), fill = Freq)) +
  geom_bar(width = 1) #將資料用成百分比
pie <- pie + coord_polar(theta = "y")
pie <- pie + theme_void()#將數值去掉
pie


透過圓餅圖了解顧客這四個月到店的消費次數,發現超過一半的人只來1.2次而已

分群
m = A0$m > 993; f = A0$f >= 3; a = A0$area %in% c("z115","z221") 
Status2 = case_when(
  m & f  ~  "A",   #購買金額高次數高
  !m & !f ~ "D",   #購買金額低次數低
  !m & f & !a ~ "E",   #購買金額低次數高不住附近
  !m & f & a ~ "F",    #購買金額低次數高住附近
  m & !f & !a ~ "B",  #購買金額高次數低不住附近
  m & !f & a ~ "C"   #購買金額高次數低住附近
  )

Status3 = case_when(
    m & !f  ~  "a1",   #潛力暴發戶
  !m & f ~ "a2")   #忠心小犬

table(Status2)
## Status2
##     A     B     C     D     E     F 
##  4431  3808  3160 11223  2417  7202
table(Status3)
## Status3
##   a1   a2 
## 6968 9619
#把分群納入資料框中
A0<-cbind(A0,Status2)
## New names:
## * NA -> ...11
names(A0)[11] <- "status"
#將資料合併
B0 = X0[,c("cust","date")]
AB <- left_join(A0,B0)
## Joining, by = "cust"
分群分佈圖
#把日期轉換成月份
AB$month <- format(as.Date(AB$date),format="%Y%m")
g2 = ggplot(AB,aes(x = month, fill = status)) +
     geom_bar()
ggplotly(g2)
#把A與Z合併
AZ <- left_join(A0,Z0)
## Joining, by = c("cust", "age", "area")
各分群平均購買次數、客單價分布
CustSegments = AB %>%
  group_by(month,status) %>% summarise(
    average_frequency = mean(f),
    average_amount = mean(m),
    total_revenue = sum(rev),
    total_no_orders = sum(Freq),
    average_recency = mean(r),
    average_seniority = mean(s),
    group_size = n())
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
df = CustSegments %>% transmute(
  `群組` = as.character(status), 
  `month` = month, 
  `平均購買次數` = average_frequency, 
  `平均客單價` = average_amount,
  `總營收貢獻` = total_revenue
  )
ggplot(df, aes(
    x=`平均購買次數`,y=`平均客單價`,color=`群組`,group=`群組`,ids=month)) +
  geom_point(aes(size=`總營收貢獻`),alpha=0.8) +
  scale_size(range=c(2,12)) -> g
ggplotly(g)


透些資料探索我們可以發現一些趨勢和需改善的問題,
我們在分群後,將主要行銷對象設定為忠心小犬(E&F)及潛力暴發戶(B&C),
提高此2群顧客的消費頻率及平均客單價,為大豐創造更多利潤

TF資料切割

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
交易記錄彙總
#依據分析對象彙整資料
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # 交易日期
  cust = first(cust),  # 顧客 ID
  age = first(age),    # 顧客 年齡級別
  area = first(area),  # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame  # 88387
處理異常值
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
顧客資料彙總
d0 = max(X$date) + 1
A = X %>% mutate(
  days = as.integer(difftime(d0, date, units="days"))
  ) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584
nrow(A)
## [1] 28584
照顧客彙總2月交易
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16900
The Target for Regression - A$amount
A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A$buy
A$buy = !is.na(A$amount)
table(A$buy, !is.na(A$amount))
##        
##         FALSE  TRUE
##   FALSE 15342     0
##   TRUE      0 13242
summary(A)
##      cust                 r               s               f         
##  Length:28584       Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  Class :character   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Mode  :character   Median :21.00   Median :68.00   Median : 2.000  
##                     Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##                     3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##                     Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw              age           
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   Length:28584      
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   Class :character  
##  Median :  709.5   Median : 1566   Median :  218.0   Mode  :character  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8                     
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0                     
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0                     
##                                                                        
##      area               amount         buy         
##  Length:28584       Min.   :    8   Mode :logical  
##  Class :character   1st Qu.:  454   FALSE:15342    
##  Mode  :character   Median :  993   TRUE :13242    
##                     Mean   : 1499                  
##                     3rd Qu.: 1955                  
##                     Max.   :28089                  
##                     NA's   :15342
依客群分資料集
LoyalDog <-A %>% filter(m<993 & f>=3) #購買金額低 購買次數多 忠心小犬
Upstart <-A %>% filter(m>993 & f<3) #購買金額高 購買次數低 潛力暴發戶
Train & Test Dataset-忠心小犬LoyalDog
LoyalDogX = subset(X, cust %in% LoyalDog$cust & date < as.Date("2001-02-01"))
LoyalDogZ = subset(Z, cust %in% LoyalDog$cust & date < as.Date("2001-02-01"))
set.seed(2018); LoyalDogspl = sample.split(LoyalDog$buy, SplitRatio=0.7)#sample.split回傳分割向量 得出train跟test
c(nrow(LoyalDog), sum(LoyalDogspl), sum(!LoyalDogspl))
## [1] 7366 5156 2210
cbind(LoyalDog, LoyalDogspl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=LoyalDogspl), alpha=0.5)


檢查忠心小犬在購買機率的分佈情形,從圖型可看出兩者間存在些許差異

LoyalDog2 = subset(LoyalDog, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(LoyalDog2)
set.seed(2018); LoyalDogspl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(LoyalDog2), sum(LoyalDogspl2), sum(!LoyalDogspl2))
## [1] 5430 3801 1629
cbind(LoyalDog2, LoyalDogspl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=LoyalDogspl2), alpha=0.5)


檢查忠心小犬在購買機率的分佈情形,從圖型可看出兩者間存在些許差異

Train & Test Dataset-潛力暴發戶Upstart
UpstartX = subset(X, cust %in% Upstart$cust & date < as.Date("2001-02-01"))
UpstartZ = subset(Z, cust %in% Upstart$cust & date < as.Date("2001-02-01"))
set.seed(2018); Upstartspl = sample.split(Upstart$buy, SplitRatio=0.7)
#sample.split回傳分割向量 得出train跟test
c(nrow(Upstart), sum(Upstartspl), sum(!Upstartspl))
## [1] 6986 4890 2096
cbind(Upstart, Upstartspl) %>% filter(buy) %>% 
  ggplot(aes(x=log(amount))) + geom_density(aes(fill=Upstartspl), alpha=0.5)


檢查潛在暴發戶在購買機率的分佈情形,從圖型可看出兩者間存在些許差異

Upstart2 = subset(Upstart, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(Upstart2)
set.seed(2018); Upstartspl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(Upstart2), sum(Upstartspl2), sum(!Upstartspl2))
## [1] 2115 1480  635
cbind(Upstart2, Upstartspl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=Upstartspl2), alpha=0.5)


檢查潛在暴發戶在購買機率的分佈情形,從圖型可看出兩者間存在些許差異

TF建立模型

Spliting for Classification-忠心小犬
LoyalDogTR = subset(LoyalDog, LoyalDogspl)
LoyalDogTS = subset(LoyalDog, !LoyalDogspl)
Classification Model-忠心小犬
LoyalDogglm1 = glm(buy ~ ., LoyalDogTR[,c(2:9, 11)], family=binomial()) 
summary(LoyalDogglm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = LoyalDogTR[, 
##     c(2:9, 11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3539  -1.0687   0.5654   0.8392   1.6185  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.041e-01  3.661e-01  -1.377 0.168560    
## r            -1.824e-02  2.071e-03  -8.808  < 2e-16 ***
## s             8.277e-03  2.269e-03   3.648 0.000264 ***
## f             1.641e-01  3.735e-02   4.393 1.12e-05 ***
## m            -7.993e-04  3.373e-04  -2.370 0.017783 *  
## rev           1.952e-04  7.982e-05   2.446 0.014462 *  
## raw          -4.185e-04  2.180e-04  -1.920 0.054880 .  
## agea29        3.454e-01  1.735e-01   1.991 0.046431 *  
## agea34        2.349e-01  1.600e-01   1.468 0.142061    
## agea39        3.301e-01  1.568e-01   2.106 0.035227 *  
## agea44        3.220e-01  1.621e-01   1.987 0.046906 *  
## agea49        3.504e-01  1.664e-01   2.105 0.035259 *  
## agea54        4.108e-02  1.785e-01   0.230 0.818019    
## agea59        1.292e-01  2.193e-01   0.589 0.555668    
## agea64        2.586e-01  2.253e-01   1.148 0.251034    
## agea69        3.749e-01  1.886e-01   1.988 0.046866 *  
## agea99       -1.329e-02  2.874e-01  -0.046 0.963131    
## areaz106      1.668e-01  3.665e-01   0.455 0.649046    
## areaz110     -4.612e-02  2.935e-01  -0.157 0.875146    
## areaz114     -8.860e-02  3.059e-01  -0.290 0.772125    
## areaz115      2.629e-01  2.651e-01   0.992 0.321387    
## areaz221      1.821e-01  2.670e-01   0.682 0.495214    
## areazOthers   2.534e-01  2.890e-01   0.877 0.380669    
## areazUnknown -2.210e-01  3.021e-01  -0.731 0.464510    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5939.4  on 5155  degrees of freedom
## Residual deviance: 5252.2  on 5132  degrees of freedom
## AIC: 5300.2
## 
## Number of Fisher Scoring iterations: 6
LoyalDogpred =  predict(LoyalDogglm1, LoyalDogTS, type="response")
LoyalDogcm = table(actual = LoyalDogTS$buy, predict = LoyalDogpred > 0.5); LoyalDogcm
##        predict
## actual  FALSE TRUE
##   FALSE    72  509
##   TRUE     74 1555
LoyalDogacc.ts = LoyalDogcm %>% {sum(diag(.))/sum(.)}
c(1-mean(LoyalDogTS$buy) , LoyalDogacc.ts)  
## [1] 0.2628959 0.7361991


用模型後,正確率從0.26提升到0.73

colAUC(LoyalDogpred, LoyalDogTS$buy)        # 0.6940807
##                     [,1]
## FALSE vs. TRUE 0.6940807


Regression Model-忠心小犬
LoyalDog2 = subset(LoyalDog, LoyalDog$buy) %>% mutate_at(c("m","rev","amount"), log10)
LoyalDogTR2 = subset(LoyalDog2, LoyalDogspl2)
LoyalDogTS2 = subset(LoyalDog2, !LoyalDogspl2)
LoyalDoglm1 = lm(amount ~ ., LoyalDogTR2[,c(2:6,8:10)])
summary(LoyalDoglm1)
## 
## Call:
## lm(formula = amount ~ ., data = LoyalDogTR2[, c(2:6, 8:10)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.88216 -0.20627  0.04979  0.26805  1.28653 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.579e-01  1.065e-01   6.179 7.14e-10 ***
## r            -5.668e-05  5.211e-04  -0.109 0.913381    
## s            -4.758e-04  5.375e-04  -0.885 0.376087    
## f             2.076e-02  2.850e-03   7.286 3.88e-13 ***
## m             5.485e-01  7.562e-02   7.254 4.88e-13 ***
## rev           1.672e-01  7.058e-02   2.369 0.017892 *  
## agea29        6.640e-02  3.649e-02   1.820 0.068912 .  
## agea34        8.172e-02  3.381e-02   2.417 0.015678 *  
## agea39        1.242e-01  3.291e-02   3.774 0.000163 ***
## agea44        1.087e-01  3.357e-02   3.238 0.001212 ** 
## agea49        8.509e-02  3.460e-02   2.459 0.013981 *  
## agea54        4.283e-02  3.760e-02   1.139 0.254799    
## agea59        1.509e-02  4.490e-02   0.336 0.736851    
## agea64       -3.931e-03  4.609e-02  -0.085 0.932037    
## agea69       -3.131e-02  3.883e-02  -0.806 0.420165    
## agea99        8.588e-02  5.725e-02   1.500 0.133700    
## areaz106      3.940e-02  8.821e-02   0.447 0.655118    
## areaz110      2.933e-02  7.420e-02   0.395 0.692627    
## areaz114      4.295e-02  7.791e-02   0.551 0.581452    
## areaz115      5.354e-02  6.755e-02   0.793 0.428055    
## areaz221      5.556e-02  6.789e-02   0.818 0.413227    
## areazOthers   1.521e-02  7.234e-02   0.210 0.833522    
## areazUnknown  6.502e-02  7.461e-02   0.871 0.383568    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4101 on 3778 degrees of freedom
## Multiple R-squared:  0.2944, Adjusted R-squared:  0.2903 
## F-statistic: 71.66 on 22 and 3778 DF,  p-value: < 2.2e-16
LoyalDogr2.tr = summary(LoyalDoglm1)$r.sq
LoyalDogSST = sum((LoyalDogTS2$amount - mean(LoyalDogTR2$amount))^ 2)
LoyalDogSSE = sum((predict(LoyalDoglm1, LoyalDogTS2) -  LoyalDogTS2$amount)^2)
LoyalDogr2.ts = 1 - (LoyalDogSSE/LoyalDogSST)
c(LoyalDogR2train=LoyalDogr2.tr, LoyalDogR2test=LoyalDogr2.ts)
## LoyalDogR2train  LoyalDogR2test 
##       0.2944228       0.2814886
Spliting for Classification-潛力暴發戶
UpstartTR = subset(Upstart, Upstartspl)
UpstartTS = subset(Upstart, !Upstartspl)
Classification Model-潛力暴發戶
Upstartglm1 = glm(buy ~ ., UpstartTR[,c(2:9, 11)], family=binomial()) 
summary(Upstartglm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = UpstartTR[, 
##     c(2:9, 11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4070  -0.8118  -0.7207   1.2682   2.0709  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.272e+00  3.348e-01  -6.786 1.15e-11 ***
## r            -4.551e-03  2.300e-03  -1.979  0.04787 *  
## s             2.898e-03  2.334e-03   1.242  0.21429    
## f             8.174e-01  1.585e-01   5.156 2.53e-07 ***
## m             8.995e-05  8.429e-05   1.067  0.28587    
## rev          -4.111e-05  6.870e-05  -0.598  0.54962    
## raw          -1.794e-04  1.521e-04  -1.179  0.23831    
## agea29        1.049e-01  2.274e-01   0.461  0.64469    
## agea34        1.934e-01  2.141e-01   0.903  0.36635    
## agea39        2.741e-01  2.120e-01   1.293  0.19601    
## agea44        2.516e-01  2.154e-01   1.168  0.24277    
## agea49        3.478e-01  2.219e-01   1.568  0.11698    
## agea54        1.328e-01  2.410e-01   0.551  0.58164    
## agea59        4.141e-01  2.676e-01   1.547  0.12183    
## agea64        2.866e-01  2.943e-01   0.974  0.33009    
## agea69        7.772e-01  2.838e-01   2.738  0.00617 ** 
## agea99        6.528e-01  3.359e-01   1.943  0.05197 .  
## areaz106     -4.957e-02  2.233e-01  -0.222  0.82432    
## areaz110     -1.015e-01  1.763e-01  -0.576  0.56492    
## areaz114      7.368e-02  1.908e-01   0.386  0.69932    
## areaz115      2.392e-01  1.668e-01   1.434  0.15147    
## areaz221      2.253e-01  1.670e-01   1.349  0.17740    
## areazOthers   2.372e-02  1.772e-01   0.134  0.89350    
## areazUnknown -5.492e-01  2.446e-01  -2.245  0.02478 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5996.1  on 4889  degrees of freedom
## Residual deviance: 5783.7  on 4866  degrees of freedom
## AIC: 5831.7
## 
## Number of Fisher Scoring iterations: 4
Upstartpred =  predict(Upstartglm1, UpstartTS, type="response")
Upstartcm = table(actual = UpstartTS$buy, predict = Upstartpred > 0.5); Upstartcm
##        predict
## actual  FALSE TRUE
##   FALSE  1439   22
##   TRUE    615   20
Upstartacc.ts = Upstartcm %>% {sum(diag(.))/sum(.)}
c(1-mean(UpstartTS$buy) , Upstartacc.ts)  # 用模型後的正確率
## [1] 0.6970420 0.6960878
colAUC(Upstartpred, UpstartTS$buy)        # 0.6173465
##                     [,1]
## FALSE vs. TRUE 0.6173465
Regression Model-潛力暴發戶
Upstart2 = subset(Upstart, Upstart$buy) %>% mutate_at(c("m","rev","amount"), log10)
UpstartTR2 = subset(Upstart2, Upstartspl2)
UpstartTS2 = subset(Upstart2, !Upstartspl2)
Upstartlm1 = lm(amount ~ ., UpstartTR2[,c(2:5,8:10)])
summary(Upstartlm1)
## 
## Call:
## lm(formula = amount ~ ., data = UpstartTR2[, c(2:5, 8:10)])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5957 -0.2129  0.0606  0.2884  1.0700 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.9568564  0.2101547   4.553 5.73e-06 ***
## r             0.0002358  0.0007470   0.316   0.7524    
## s             0.0001052  0.0007586   0.139   0.8897    
## f             0.0629430  0.0331501   1.899   0.0578 .  
## m             0.6138312  0.0555493  11.050  < 2e-16 ***
## agea29       -0.0158818  0.0782064  -0.203   0.8391    
## agea34        0.0230267  0.0718094   0.321   0.7485    
## agea39        0.1058886  0.0714651   1.482   0.1386    
## agea44        0.0812381  0.0727503   1.117   0.2643    
## agea49       -0.0209388  0.0742414  -0.282   0.7780    
## agea54        0.0087930  0.0816758   0.108   0.9143    
## agea59        0.0738311  0.0899099   0.821   0.4117    
## agea64       -0.0334659  0.0969835  -0.345   0.7301    
## agea69       -0.0654049  0.0940655  -0.695   0.4870    
## agea99        0.1521789  0.1166531   1.305   0.1923    
## areaz106     -0.0154511  0.0749086  -0.206   0.8366    
## areaz110      0.0075053  0.0596472   0.126   0.8999    
## areaz114     -0.0612669  0.0638730  -0.959   0.3376    
## areaz115     -0.0749400  0.0555279  -1.350   0.1774    
## areaz221     -0.0334140  0.0554566  -0.603   0.5469    
## areazOthers   0.0303755  0.0597953   0.508   0.6115    
## areazUnknown -0.1239351  0.0863085  -1.436   0.1512    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4099 on 1458 degrees of freedom
## Multiple R-squared:  0.1083, Adjusted R-squared:  0.09542 
## F-statistic: 8.429 on 21 and 1458 DF,  p-value: < 2.2e-16
Upstartr2.tr = summary(Upstartlm1)$r.sq
UpstartSST = sum((UpstartTS2$amount - mean(UpstartTR2$amount))^ 2)
UpstartSSE = sum((predict(Upstartlm1, UpstartTS2) -  UpstartTS2$amount)^2)
Upstartr2.ts = 1 - (UpstartSSE/UpstartSST)
c(UpstartR2train=Upstartr2.tr, UpstartR2test=Upstartr2.ts)
## UpstartR2train  UpstartR2test 
##     0.10826081     0.03312782

TF預測(購買機率與預期營收)

資料日期為2000-12-01 ~ 2001~02-28

忠心小犬
d0 = max(X0$date) + 1
B = X0 %>% 
  filter(date >= as.Date("2000-12-01")) %>% 
  mutate(days = as.integer(difftime(d0, date, units="days"))) %>% 
  group_by(cust) %>% summarise(
    r = min(days),      # recency
    s = max(days),      # seniority
    f = n(),            # frquency
    m = mean(total),    # monetary
    rev = sum(total),   # total revenue contribution
    raw = sum(gross),   # total gross profit contribution
    age = age[1],       # age group
    area = area[1],     # area code
  ) %>% data.frame      # 28584

LoyalDogB <-B %>% filter(m<993 & f>=3) #購買金額低 購買次數多 忠心小犬
LoyalDogB$Buy = predict(LoyalDogglm1, LoyalDogB, type="response")
LoyalDogB2 = LoyalDogB %>% mutate_at(c("m","rev"), log10)
LoyalDogB$Rev = 10^predict(LoyalDoglm1, LoyalDogB2)

對忠心小犬客群的預測購買金額做指數、對數轉換

par(mfrow=c(1,2), cex=0.8)
hist(LoyalDogB$Buy)
hist(log(LoyalDogB$Rev,10))


從上圖可發現,忠心小犬的購買機率落在0.7~0.9之間(該族群的購買機率確實偏高)
而在預期營收方面,呈現常態分佈,預期營收平均約落在899左右

潛力暴發戶
UpstartB <-B %>% filter(m>993 & f<3) #購買金額高 購買次數低 潛力暴發戶
UpstartB$Buy = predict(Upstartglm1, UpstartB, type="response")
UpstartB2 = UpstartB %>% mutate_at(c("m","rev"), log10)
UpstartB$Rev = 10^predict(Upstartlm1, UpstartB2)

對忠心小犬客群的預測購買金額做指數、對數轉換

par(mfrow=c(1,2), cex=0.8)
hist(UpstartB$Buy)
hist(log(UpstartB$Rev,10))


從上圖可發現,潛力潛力暴發戶的購買機率較為分散,與忠心小犬相比其購買機率明顯較低
而在預期營收方面,呈現常態分佈,預期營收平均約落在1252左右

市場模擬


接著我們要做出具體的行銷方向,再透過假設進行市場模擬,
了解出這些行銷方案在忠心小犬與潛力暴發戶上到底有沒有效

成本效益函數
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}
估計毛利率(margin)
marLoyalDog = sum(LoyalDogB$raw)/sum(LoyalDogB$rev)
marUpstart = sum(UpstartB$raw)/sum(UpstartB$rev)

我們針對兩族群進行毛利率估計後,發現忠心小犬(loyaldog)為0.14;
潛力暴發戶(upstart)則為0.17

估計預期報償-忠心小犬
m=0.2; b=25; a=40; x=30
LoyalDogdp = pmin(1-LoyalDogB$Buy, DP(x,m,b,a))
LoyalDogeR = LoyalDogdp*LoyalDogB$Rev*marLoyalDog - x
hist(LoyalDogeR,main="預期報償分佈",xlab="預期報償",ylab="顧客數")


針對LoyalDog這個族群分析預期報償,發現約落在-30到10之間,
於是我們根據上圖所得的結果,針對預期報償大於0之顧客進行促銷

單一參數組合-忠心小犬
m=0.2; b=25; a=40; X = seq(10,45,1)

LoyalDogdf = sapply(X, function(x) {
  LoyalDogdp = pmin(DP(x,m,b,a),1-LoyalDogB$Buy)
  LoyalDogeR = LoyalDogdp*LoyalDogB$Rev*marLoyalDog - x
  c(x=x, eReturn=sum(LoyalDogeR), N=sum(LoyalDogeR > 0), eReturn2=sum(LoyalDogeR[LoyalDogeR > 0]))
  }) %>% t %>% data.frame 
#eReturn對所有人行銷的總預期效益
#n 預期收益大於0的人數
#eReturn 只對收益大於0的人做行銷的總預期收益
LoyalDogdf %>% gather('key','value',-x) %>% 
  ggplot(aes(x=x, y=value, col=key)) + 
  geom_hline(yintercept=0,linetype='dashed') +
  geom_line(size=1.5,alpha=0.5) + 
  facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()


針對該族群的所有人進行行銷,結果會發現總體預期報償會呈現負數,
然而在進行調整過後,發現針對預期報償大於0之顧客的總體預期報償則明顯提高

市場模擬:不同的參數組合的比較-忠心小犬
mm=c(0.20, 0.25, 0.15, 0.25)
bb=c(  25,   30,   15,   30)
aa=c(  40,   40,   30,   60) 
X = seq(10, 60, 1) 
LoyalDogdf2 = do.call(rbind, lapply(1:length(mm), function(i) {
  sapply(X, function(x) {
    LoyalDogdp2 = pmin(1-LoyalDogB$Buy, DP(x,mm[i],bb[i],aa[i]))
    LoyalDogeR2 = LoyalDogdp2*LoyalDogB$Rev*marLoyalDog - x
    c(i=i, x=x, eR.ALL=sum(LoyalDogeR2), N=sum(LoyalDogeR2>0), eR.SEL=sum(LoyalDogeR2[LoyalDogeR2 > 0]) )
    }) %>% t %>% data.frame
  })) 

LoyalDogdf2 %>% 
  mutate_at(vars(eR.ALL, eR.SEL), function(y) round(y/1000)) %>% 
  gather('key','value',-i,-x) %>% 
  mutate(Instrument = paste0('I',i)) %>%
  ggplot(aes(x=x, y=value, col=Instrument)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('工具選項(成本)') + ylab('預期報償(K)') + 
  ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw() -> p

plotly::ggplotly(p)


從圖中可發現行銷工具中,I3的預期報償表現顯著突出
當工具成本20時,對所有人的預期效益會是-583310,
目標客群人數為791,然而僅針對部分顧客進行行銷時為2046

每一個工具的最佳參數-忠心小犬
group_by(LoyalDogdf2, i) %>% top_n(1,eR.SEL)
## # A tibble: 4 x 5
## # Groups:   i [4]
##       i     x   eR.ALL     N eR.SEL
##   <dbl> <dbl>    <dbl> <dbl>  <dbl>
## 1     1    32 -124049.   109   218.
## 2     2    38 -145268.   191   472.
## 3     3    20  -58331.   791  2047.
## 4     4    40 -165002.    48   104.

從工具的最佳參數中,可看出I3的效果最好,
因此對Loyaldog這個族群,我們以I3作為行銷工具。

估計預期報償-潛力暴發戶
m=0.2; b=25; a=40; x=30
Upstartdp = pmin(1-UpstartB$Buy, DP(x,m,b,a))
UpstarteR = Upstartdp*UpstartB$Rev*marUpstart - x
hist(UpstarteR,main="預期報償分佈",xlab="預期報償",ylab="顧客數")


針對Upstart這個族群分析預期報償,發現約落在-20到80之間,
於是我們根據上圖所得的結果,針對預期報償大於0之顧客進行促銷

單一參數組合-潛力暴發戶
m=0.2; b=25; a=40; X = seq(10,45,1)

Upstartdf = sapply(X, function(x) {
  Upstartdp = pmin(DP(x,m,b,a),1-UpstartB$Buy)
  UpstarteR = Upstartdp*UpstartB$Rev*marUpstart - x
  c(x=x, eReturn=sum(UpstarteR), N=sum(UpstarteR > 0), eReturn2=sum(UpstarteR[UpstarteR > 0]))
  }) %>% t %>% data.frame 
#eReturn對所有人行銷的總預期效益
#n 預期收益大於0的人數
#eReturn 只對收益大於0的人做行銷的總預期收益
Upstartdf %>% gather('key','value',-x) %>% 
  ggplot(aes(x=x, y=value, col=key)) + 
  geom_hline(yintercept=0,linetype='dashed') +
  geom_line(size=1.5,alpha=0.5) + 
  facet_wrap(~key,ncol=1,scales='free_y') + theme_bw()


針對該族群的所有人進行行銷,結果會發現總體預期報償的區間大於忠心小犬的區間,
然而在進行調整過後,發現針對預期報償大於0之顧客的總體預期報償則明顯提高

市場模擬:不同的參數組合的比較-潛力暴發戶
X = seq(10, 60, 1) 
Upstartdf2 = do.call(rbind, lapply(1:length(mm), function(i) {
  sapply(X, function(x) {
    Upstartdp2 = pmin(1-UpstartB$Buy, DP(x,mm[i],bb[i],aa[i]))
    UpstarteR2 = Upstartdp2*UpstartB$Rev*marUpstart - x
    c(i=i, x=x, eR.ALL=sum(UpstarteR2), N=sum(UpstarteR2>0), eR.SEL=sum(UpstarteR2[UpstarteR2 > 0]) )
    }) %>% t %>% data.frame
  })) 

Upstartdf2 %>% 
  mutate_at(vars(eR.ALL, eR.SEL), function(y) round(y/1000)) %>% 
  gather('key','value',-i,-x) %>% 
  mutate(Instrument = paste0('I',i)) %>%
  ggplot(aes(x=x, y=value, col=Instrument)) + 
  geom_hline(yintercept=0, linetype='dashed', col='blue') +
  geom_line(size=1.5,alpha=0.5) + 
  xlab('工具選項(成本)') + ylab('預期報償(K)') + 
  ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
    facet_wrap(~key,ncol=1,scales='free_y') + theme_bw() -> p
#er.all所有人都做
#er.sel  er>0的人中 對他們做行銷的er
#er.n er>0的人數

plotly::ggplotly(p)

從圖中可發現行銷工具中,每個工具所對應的成本、預期報償皆不同,
而其中I2的預期報償表現稍顯突出
當工具成本40時,對所有人的預期效益會是62608,
目標客群人數為4376,然而僅針對部分顧客進行行銷時78269

另外,我們也可以透過行銷模擬工具來調整參數,
找出最適合的行銷工具(可參考G10_FinalSimulate.rmd)
Fig-2:行銷模擬工具
每一個工具的最佳參數-潛力暴發戶
group_by(Upstartdf2, i) %>% top_n(1,eR.SEL)
## # A tibble: 4 x 5
## # Groups:   i [4]
##       i     x eR.ALL     N eR.SEL
##   <dbl> <dbl>  <dbl> <dbl>  <dbl>
## 1     1    35 29692.  3609 50655.
## 2     2    40 62608.  4376 78269.
## 3     3    22 48437.  4920 54118.
## 4     4    43 32467.  3512 59605.


從工具的最佳參數中,可看出I2的效果最好,
因此對upstart這個族群,我們以I2作為行銷工具

save(LoyalDogZ, LoyalDogX, LoyalDog, LoyalDog2, LoyalDogspl, LoyalDogspl2,UpstartZ, UpstartX, Upstart, Upstart2, Upstartspl, Upstartspl2, file="data/tf3.rdata")
save(LoyalDogB, UpstartB, file='data/tf4.rdata')
save(AB, AZ, Status2, file="data/tf5.rdata")